Text Processing Script.R

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Please determine the required text preprocessing steps using the following flag
replace_special_chars <- TRUE
remove_duplicate_chars <- TRUE
replace_numbers <- TRUE
convert_to_lower_case <- TRUE
remove_default_stopWords <- TRUE
remove_given_stopWords <- TRUE
stem_words <- TRUE
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Map 1-based optional input ports to variables
dataset1 <- maml.mapInputPort(1) # class: data.frame
# get the label and text columns from the input data set
text_column <- dataset1[["tweet_text"]]
#label_column <- dataset1[["label_column"]]

stopword_list <- NULL
result <- tryCatch({
   dataset2 <- maml.mapInputPort(2) # class: data.frame
   # get the stopword list from the second input data set
   stopword_list <- dataset2[[1]]
}, warning = function(war) {
   # warning handler
   print(paste("WARNING: ", war))
}, error = function(err) {
   # error handler
   print(paste("ERROR: ", err))
   stopword_list <- NULL
}, finally = {})

# Load the R script from the Zip port in ./src/
source("src/text.preprocessing.R");

text_column <- preprocessText(text_column,
                         replace_special_chars,
                         remove_duplicate_chars,
                         replace_numbers,
                         convert_to_lower_case,
                         remove_default_stopWords,
                         remove_given_stopWords,
                         stem_words,
                         stopword_list)
Sentinment <- dataset1[["sentiment_label"]]
data.set <- data.frame(
                Sentinment,
                text_column,
                stringsAsFactors = FALSE
                )

# Select data.frame to be sent to the output Dataset port
maml.mapOutputPort("data.set")